Data Cleaning

Tasks Complete

  • Combine eGFR and weight data sets
  • Clean variable values
    • Pregnant (Yes, No)
    • Season (Rainy, Dry)
  • Create new variables
    • Monday indicator for measurement date
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(stringr)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
gfr <- read.csv("eGFR.csv")
weight <- read.csv("weight.csv")

weight$Visit <- as.numeric(str_extract(weight$Study.visit, "[0-9]"))
weight$Id <- as.numeric(sapply(str_split(weight$Id, "-"), function(v) v[3]))
weight$Pregnant[which(weight$Pregnant == "")] <- NA
weight$Date <- mdy(weight$Date)

gfr <- left_join(gfr, weight)
## Joining, by = c("Id", "Visit")
gfr$Monday <- wday(gfr$Date, label = TRUE) == "Mon"
gfr <- gfr %>%
    mutate(Season = case_when(Season == "Rainy season" ~ "Rainy", Season == "Dry season" ~ 
        "Dry")) %>%
    mutate(SeasonRainy = if_else(Season == "Rainy", 1, 0))

gfr <- gfr %>%
    mutate(Pregnant = if_else(is.na(Pregnant), "No", Pregnant)) %>%
    mutate(PregnantYes = if_else(Pregnant == "Yes", 1, 0)) %>%
    select(Id, Visit, Sex, Community, Age, eGFR_cre, Months, Height..meters., Weight..kg., 
        BMI, Season, SeasonRainy, Pregnant, PregnantYes, Date, Monday)

head(gfr)
##   Id Visit  Sex Community Age eGFR_cre Months Height..meters. Weight..kg.   BMI
## 1  1     2 Male Tololar 1  22 123.8020      6            1.78        80.3 25.34
## 2  1     6 Male Tololar 1  25 118.3518     36            1.78        78.9 24.90
## 3  1     4 Male Tololar 1  23 116.7217     18            1.78        82.7 26.10
## 4  1     5 Male Tololar 1  24 122.0748     24            1.78        77.6 24.49
## 5  1     7 Male Tololar 1  27 122.9960     48            1.78        79.4 25.06
## 6  1     3 Male Tololar 1  23 128.7687     12            1.78        72.6 22.91
##   Season SeasonRainy Pregnant PregnantYes       Date Monday
## 1    Dry           0       No           0 2015-05-27  FALSE
## 2  Rainy           1       No           0 2017-10-23   TRUE
## 3    Dry           0       No           0 2016-06-03  FALSE
## 4  Rainy           1       No           0 2016-10-17   TRUE
## 5  Rainy           1       No           0 2018-10-22   TRUE
## 6  Rainy           1       No           0 2015-10-29  FALSE

Analytic Subset (n = 301)

  • Individuals with 4 or more observations over time

New variables

  • Baseline eGFR: Mean eGFR of non-pregnant eGFR during Visit 1 and 2
  • Change/Difference from Baseline: eGFR - Baseline eGFR
  • Percent Change from Baseline: (eGFR - Baseline eGFR)/(Baseline eGFR)
gfr_sub <- gfr %>%
    group_by(Id) %>%
    mutate(Count = n()) %>%
    ungroup() %>%
    filter(Count > 3) %>%
    select(-Count)

gfr_sub <- gfr_sub %>%
    filter(Visit == 1 & Pregnant == "No") %>%
    group_by(Id) %>%
    mutate(Baseline = mean(eGFR_cre)) %>%
    ungroup() %>%
    distinct(Baseline, Id) %>%
    right_join(gfr_sub) %>%
    filter(!is.na(Baseline))
## Joining, by = "Id"
gfr_sub %>%
    distinct(Id, Baseline) %>%
    ggplot(aes(x = Baseline)) + geom_histogram() + geom_vline(xintercept = 90) + 
    theme_classic()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

gfr_sub2 <- gfr_sub %>%
    # filter(Baseline > 60) %>%
mutate(egfrChg = eGFR_cre - Baseline) %>%
    mutate(egfrPer = (eGFR_cre - Baseline)/Baseline) %>%
    select(Id, Months, egfrChg, egfrPer, eGFR_cre, Baseline, Sex, Pregnant, PregnantYes, 
        Season, SeasonRainy, Age, Monday, Weight..kg., Visit) %>%
    arrange(Id, Months)

Data Exploration

Measurement day v. eGFR

  • X-axis: Monday vs. Other Measurement Days
  • Y-axis: Subject specific z-score of eGFR

Observation: Individuals have slightly lower eGFR for their normal on Monday than other days.

# Monday vs. Other Days
gfr_sub2 %>%
    group_by(Id) %>%
    mutate(zegfr = scale(eGFR_cre)) %>%
    ggplot(aes(x = Monday, y = zegfr)) + geom_boxplot() + ylab("eGFR Subject Z Score") + 
    theme_classic()

Pregnancy v. eGFR among Females

  • X-axis: Pregnant vs. Not Pregnant
  • Y-axis: Subject specific z-score of eGFR

Observation: Individuals have higher eGFR in comparison to their normal when pregnant.

gfr_sub2 %>%
    filter(Sex == "Female") %>%
    group_by(Id) %>%
    mutate(zegfr = scale(eGFR_cre)) %>%
    ggplot(aes(x = Pregnant, y = zegfr)) + geom_boxplot() + ylab("eGFR Subject Z Score") + 
    theme_classic()

Weight v. eGFR

  • X-axis: Weight in kg
  • Y-axis: Subject specific z-score of eGFR

Observation: No relationship between weight and eGFR.

gfr_sub2 %>%
    group_by(Id) %>%
    mutate(zegfr = scale(eGFR_cre)) %>%
    ggplot(aes(x = Weight..kg., y = zegfr, color = Sex)) + geom_point() + xlab("Weight (kg)") + 
    ylab("eGFR Subject Z Score") + theme_classic()

  • X-axis: Subject specific z-score of weight
  • Y-axis: Subject specific z-score of eGFR

Observation: No relationship between weight and eGFR.

gfr_sub2 %>%
    group_by(Id) %>%
    mutate(zweight = scale(Weight..kg.)) %>%
    mutate(zegfr = scale(eGFR_cre)) %>%
    ggplot(aes(x = zweight, y = zegfr, color = Sex)) + geom_point() + xlab("Weight Subject Z Score") + 
    ylab("eGFR Subject Z Score") + theme_classic()

Season

  • X-axis: Rainy vs. Dry Season
  • Y-axis: Subject specific z-score of eGFR

Observation: Individuals have higher eGFR in comparison to their normal during Dry Season

gfr_sub2 %>%
    group_by(Id) %>%
    mutate(zegfr = scale(eGFR_cre)) %>%
    ggplot(aes(x = Season, y = zegfr)) + geom_boxplot() + ylab("eGFR Subject Z Score") + 
    theme_classic()

gfr_sub2 %>%
    ggplot(aes(x = Months, y = egfrPer, group = Id)) + geom_line() + theme_classic()

gfr_sub2 %>%
    ggplot(aes(x = Months, y = egfrChg, group = Id)) + geom_line() + theme_classic()

Analytical Sample

  • 301 individuals with at 4 or more eGFR observations and baseline measurements
gfr_sub2 %>%
    filter(Visit == 1) %>%
    nrow()
## [1] 303
gfr_sub2 %>%
    filter(Visit == 1) %>%
    count(Sex)
## # A tibble: 2 x 2
##   Sex        n
##   <chr>  <int>
## 1 Female    75
## 2 Male     228
gfr_sub2 %>%
    filter(Visit == 1) %>%
    pull(Age) %>%
    summary()
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   18.00   21.00   24.00   23.71   27.00   30.00
gfr_sub2 <- gfr_sub2 %>%
    mutate(Years = Months/12) %>%
    mutate(eGFRCat = factor(case_when(eGFR_cre > 90 ~ "Normal", eGFR_cre <= 90 ~ 
        "Low"))) %>%
    mutate(eGFRPerCat = factor(case_when(egfrPer > -0.1 ~ "Normal", egfrPer <= -0.1 ~ 
        "Low")))
  • Outliers
gfr_sub2 %>%
    filter(Id == 172) %>%
    ggplot(aes(x = Years, y = eGFR_cre)) + geom_line() + theme_classic()

gfr_sub2 %>%
    filter(Id == 121) %>%
    ggplot(aes(x = Years, y = eGFR_cre)) + geom_line() + theme_classic()

gfr_sub2 %>%
    filter(Id == 212) %>%
    ggplot(aes(x = Years, y = eGFR_cre)) + geom_line() + theme_classic()

gfr_sub2 <- gfr_sub2 %>%
    filter(!(Id %in% c(172, 121)))

#Model Estimation

Difference HMM

Outcome Assumptions

  • Outcome: eGFR Difference from Baseline
  • Baseline: Non-pregnant eGFR’s from Visit 1

State Assumptions

  • Healthy State: Difference from Baseline ~ a0 where a0 is initially assumed N(0, 5^2)
  • Unhealthy State: Difference from Baseline ~ b0 where b0 is fixed at N(-30, 8^2)

Transition Assumptions

Transition intensity from state r to s is the e instantaneous risk of moving from state r to state s, modeled as

\[q_{rs}(z) = q_{rs}^{(0)}\exp(\beta^T_{rs}z)\]

and \(q_{rr} = -\sum_{s\not= r}q_{rs}\).

Constraints

  • We expect about 5% of this population to start in the Unhealthy State and 95% starts in Healthy State
  • Transition intensities and probabilities are independent of past state history (Markov assumption)
  • Absolute value of eGFR (Normal > 90, Low <= 90) impact transition intensities
library("msm")

hmodel1 <- list(hmmNorm(mean = 0, sd = 5), hmmNorm(mean = -30, sd = 8))
mod1 <- msm(egfrChg ~ Years, subject = Id, data = gfr_sub2, qmatrix = rbind(c(0, 
    0.01), c(0.01, 0)), hmodel = hmodel1, covariates = ~eGFRCat, initprobs = c(0.95, 
    0.05), control = list(reltol = 1e-25, maxit = 10000), fixedpars = c(7, 8))

Fitted Model

Baseline Transition Intensities = \(q_{rs}^{(0)}\)

Hazard ratios = \(\exp(\beta_{rs})\)

Parameters

State 1: Healthy

  • mean: estimated intercept
  • sd: estimated variability of outcome in state

State 2: Unhealthy

  • mean: estimated intercept
  • sd: estimated variability of outcome in state
mod1
## 
## Call:
## msm(formula = egfrChg ~ Years, subject = Id, data = gfr_sub2,     qmatrix = rbind(c(0, 0.01), c(0.01, 0)), hmodel = hmodel1,     covariates = ~eGFRCat, initprobs = c(0.95, 0.05), fixedpars = c(7,         8), control = list(reltol = 1e-25, maxit = 10000))
## 
## Maximum likelihood estimates
## Baselines are with covariates set to their means
## 
## Transition intensities with hazard ratios for each covariate
##                   Baseline                     eGFRCatNormal        
## State 1 - State 1 -0.09873 (-0.12646,-0.07708)                      
## State 1 - State 2  0.09873 ( 0.07708, 0.12646) 0.8167 (0.4068,1.640)
## State 2 - State 1  0.20529 ( 0.11602, 0.36324) 2.4352 (0.8296,7.148)
## State 2 - State 2 -0.20529 (-0.36324,-0.11602)                      
## 
## Hidden Markov model, 2 states
## State 1 - normal distribution
## Parameters: 
##       Estimate       LCL       UCL
## mean -3.033914 -3.405789 -2.662039
## sd    7.603481  7.325144  7.892394
## 
## State 2 - normal distribution
## Parameters: 
##      Estimate LCL UCL
## mean      -30  NA  NA
## sd          8  NA  NA
## 
## 
## -2 * log-likelihood:  16940.57 
## [Note, to obtain old print format, use "printold.msm"]

Estimated probability of going from state r to s in the period of t years for given covariate values

library(purrr)
library(tidyr)
library(tibble)

dat <- crossing(t = 0:5, A = 0:1)


tranprobs <- function(x, t, A) {
    as.vector(pmatrix.msm(x = x, t = t, covariates = list(eGFRCatNormal = A)))
}
dat %>%
    select(t, A) %>%
    pmap(.f = tranprobs, x = mod1) %>%
    enframe() %>%
    mutate(transition = list(c("State 1 to State 1", "State 2 to State 1", "State 1 to State 2", 
        "State 2 to State 2"))) %>%
    bind_cols(dat %>%
        select(t, A)) %>%
    unnest() %>%
    rename(prob = value) %>%
    mutate(eGFR = case_when(A == 1 ~ "Normal eGFR", TRUE ~ "Low eGFR")) %>%
    # mutate(Sex = case_when(B == 1 ~ 'Male', TRUE ~'Female')) %>%
ggplot(aes(x = t, y = prob, color = transition, group = transition)) + geom_line() + 
    facet_grid(. ~ eGFR) + ylim(c(0, 1)) + labs(y = "Probabilitiy of Transition in Time Period", 
    x = "Time Period (Years)", color = "Transition") + theme_classic()
## Warning: `cols` is now required when using unnest().
## Please use `cols = c(value, transition)`

hazard.msm(mod1)  #hazard ratios with CI
## $eGFRCatNormal
##                          HR         L        U
## State 1 - State 2 0.8167364 0.4068397 1.639609
## State 2 - State 1 2.4351988 0.8296341 7.147962

Percent Change HMM

Outcome Assumptions

  • Outcome: Percent Change in eGFR from Baseline (eGFR - Baseline)/Baseline
  • Baseline: Non-pregnant eGFR’s from Visit 1

State Assumptions

  • Healthy State: Percent Change from Baseline ~ a0 where a0 is initially assumed N(0, 0.1^2)
  • Unhealthy State: Percent Change from Baseline ~ b0 where b0 is fixed N(-0.3, 0.1^2)

Transition Assumptions

Transition intensity from state r to s is the e instantaneous risk of moving from state r to state s, modeled as

\[q_{rs}(z) = q_{rs}^{(0)}\exp(\beta^T_{rs}z)\]

and \(q_{rr} = -\sum_{s\not= r}q_{rs}\).

Constraints

  • We expect about 5% of this population to start in the Unhealthy State (eGFR < 90) and 95% starts in Healthy State (eGFR > 90)
  • Transition intensities and probabilities are independent of past state history (Markov assumption)
  • Absolute value of eGFR (Normal > 90, Low <= 90) impact transition intensities
library("msm")

hmodel2 <- list(hmmNorm(mean = 0, sd = 0.1), hmmNorm(mean = -0.3, sd = 0.1))
mod2 <- msm(egfrPer ~ Years, subject = Id, data = gfr_sub2, qmatrix = rbind(c(0, 
    0.01), c(0.01, 0)), hmodel = hmodel2, covariates = ~eGFRCat, initprobs = c(0.95, 
    0.05), control = list(reltol = 1e-25, maxit = 10000), fixedpars = c(7, 8))

Fitted Model

Baseline Transition Intensities = \(q_{rs}^{(0)}\)

Hazard ratios = \(\exp(\beta_{rs})\)

Parameters

State 1: Healthy

  • mean: estimated intercept
  • sd: estimated variability of outcome in state

State 2: Unhealthy

  • mean: estimated intercept
  • sd: estimated variability of outcome in state
mod2
## 
## Call:
## msm(formula = egfrPer ~ Years, subject = Id, data = gfr_sub2,     qmatrix = rbind(c(0, 0.01), c(0.01, 0)), hmodel = hmodel2,     covariates = ~eGFRCat, initprobs = c(0.95, 0.05), fixedpars = c(7,         8), control = list(reltol = 1e-25, maxit = 10000))
## 
## Maximum likelihood estimates
## Baselines are with covariates set to their means
## 
## Transition intensities with hazard ratios for each covariate
##                   Baseline                     eGFRCatNormal         
## State 1 - State 1 -0.07623 (-0.09973,-0.05827)                       
## State 1 - State 2  0.07623 ( 0.05827, 0.09973) 0.2596 (0.1434,0.4699)
## State 2 - State 1  0.11207 ( 0.03983, 0.31539) 1.1354 (0.2437,5.2887)
## State 2 - State 2 -0.11207 (-0.31539,-0.03983)                       
## 
## Hidden Markov model, 2 states
## State 1 - normal distribution
## Parameters: 
##         Estimate         LCL         UCL
## mean -0.02542415 -0.02902379 -0.02182450
## sd    0.07318805  0.07051590  0.07596146
## 
## State 2 - normal distribution
## Parameters: 
##      Estimate LCL UCL
## mean     -0.3  NA  NA
## sd        0.1  NA  NA
## 
## 
## -2 * log-likelihood:  -4150.321 
## [Note, to obtain old print format, use "printold.msm"]

Estimate probability of going from state r to s in the period of t years for given covariate values

library(purrr)
library(tidyr)
library(tibble)

dat <- crossing(t = 0:5, A = 0:1)


tranprobs <- function(x, t, A) {
    as.vector(pmatrix.msm(x = x, t = t, covariates = list(eGFRCatNormal = A)))
}
dat %>%
    select(t, A) %>%
    pmap(.f = tranprobs, x = mod2) %>%
    enframe() %>%
    mutate(transition = list(c("State 1 to State 1", "State 2 to State 1", "State 1 to State 2", 
        "State 2 to State 2"))) %>%
    bind_cols(dat %>%
        select(t, A)) %>%
    unnest() %>%
    rename(prob = value) %>%
    mutate(eGFR = case_when(A == 1 ~ "Normal eGFR", TRUE ~ "Low eGFR")) %>%
    # mutate(Sex = case_when(B == 1 ~ 'Male', TRUE ~'Female')) %>%
ggplot(aes(x = t, y = prob, color = transition, group = transition)) + geom_line() + 
    facet_grid(. ~ eGFR) + ylim(c(0, 1)) + labs(y = "Probabilitiy of Transition in Time Period", 
    x = "Time Period (Years)", color = "Transition") + theme_classic()
## Warning: `cols` is now required when using unnest().
## Please use `cols = c(value, transition)`

hazard.msm(mod2)  #hazard ratios with CI
## $eGFRCatNormal
##                          HR         L         U
## State 1 - State 2 0.2595931 0.1434145 0.4698868
## State 2 - State 1 1.1353537 0.2437311 5.2887307

Raw Value HMM

Outcome Assumptions

  • Outcome: eGFR

State Assumptions

  • Healthy State: eGFR ~ a0 + a1PregnantYes + a2SeasonRainy where a0 is initially assumed Norm(120, 12^2) but estimated
  • Unhealthy State: eGFR ~ b0 + a1PregnantYes + b2SeasonRainy where b0 is assumed Norm(60, 15^2) but estimated

Transition Assumptions

Transition intensity from state r to s is the e instantaneous risk of moving from state r to state s, modeled as

\[q_{rs}(z) = q_{rs}^{(0)}\exp(\beta^T_{rs}z)\]

and \(q_{rr} = -\sum_{s\not= r}q_{rs}\).

Constraints

  • We expect about 5% of this population to start in the Unhealthy State (eGFR < 90) and 95% starts in Healthy State (eGFR > 90)
  • Transition intensities and probabilities are independent of past state history (Markov assumption)
library("msm")

hmodel3 <- list(hmmNorm(mean = 120, sd = 12), hmmNorm(mean = 60, 15))

mod3 <- msm(eGFR_cre ~ Years, subject = Id, data = gfr_sub2, qmatrix = rbind(c(0, 
    0.01), c(0.01, 0)), hmodel = hmodel3, hcovariates = list(~PregnantYes + SeasonRainy, 
    ~PregnantYes + SeasonRainy), hconstraint = list(PregnantYes = c(1, 1)), initprobs = c(0.95, 
    0.05), control = list(reltol = 1e-25, maxit = 10000), center = FALSE)  #,fixedpars=c(5,6)

Fitted Model

Baseline Transition Intensities = \(q_{rs}^{(0)}\)

Hazard ratios = \(\exp(\beta_{rs})\)

Parameters

State 1: Healthy

  • mean: estimated intercept
  • sd: estimated variability of outcome in state
  • PregnantYes: estimated difference in mean eGFR between Pregnant and non-Pregnant observations (constrained to be the same for both states)
  • SeasonRainy: estimated difference in mean eGFR between Rainy and Dry Seasons

State 2: Unhealthy

  • mean: estimated intercept
  • sd: estimated variability of outcome in state
  • PregnantYes: estimated difference in mean eGFR between Pregnant and non-Pregnant observations (constrained to be the same for both states)
  • SeasonRainy: estimated difference in mean eGFR between Rainy and Dry Seasons
mod3
## 
## Call:
## msm(formula = eGFR_cre ~ Years, subject = Id, data = gfr_sub2,     qmatrix = rbind(c(0, 0.01), c(0.01, 0)), hmodel = hmodel3,     hcovariates = list(~PregnantYes + SeasonRainy, ~PregnantYes +         SeasonRainy), hconstraint = list(PregnantYes = c(1, 1)),     initprobs = c(0.95, 0.05), center = FALSE, control = list(reltol = 1e-25,         maxit = 10000))
## 
## Maximum likelihood estimates
## Baselines are with covariates set to 0
## 
## Transition intensities
##                   Baseline                      
## State 1 - State 1 -0.07321 (-0.099392,-0.053929)
## State 1 - State 2  0.07321 ( 0.053929, 0.099392)
## State 2 - State 1  0.00815 ( 0.001338, 0.049651)
## State 2 - State 2 -0.00815 (-0.049651,-0.001338)
## 
## Hidden Markov model, 2 states
## State 1 - normal distribution
## Parameters: 
##               Estimate        LCL         UCL
## mean        121.911144 121.004627 122.8176621
## sd            9.066679   8.608588   9.5491469
## PregnantYes  13.418100   7.933667  18.9025323
## SeasonRainy  -1.615515  -2.679358  -0.5516715
## 
## State 2 - normal distribution
## Parameters: 
##              Estimate        LCL       UCL
## mean        78.489625 73.9837877 82.995461
## sd          23.514248 22.1634408 24.947384
## PregnantYes 13.418100  7.9336669 18.902532
## SeasonRainy  4.506299 -0.3606806  9.373279
## 
## 
## -2 * log-likelihood:  17816.77 
## [Note, to obtain old print format, use "printold.msm"]

Estimate probability of going from state r to s in the period of t months for given covariate values

library(purrr)
library(tidyr)
library(tibble)

dat <- crossing(t = 0:3, A = 0:1)


tranprobs <- function(x, t, A) {
    as.vector(pmatrix.msm(x = x, t = t, covariates = list(eGFRPerCatNormal = A)))
}
dat %>%
    select(t, A) %>%
    pmap(.f = tranprobs, x = mod3) %>%
    enframe() %>%
    mutate(transition = list(c("State 1 to State 1", "State 2 to State 1", "State 1 to State 2", 
        "State 2 to State 2"))) %>%
    bind_cols(dat %>%
        select(t, A)) %>%
    unnest() %>%
    rename(prob = value) %>%
    mutate(eGFR = case_when(A == 1 ~ "Normal eGFR Change", TRUE ~ "High eGFR Decline")) %>%
    # mutate(Sex = case_when(B == 1 ~ 'Male', TRUE ~'Female')) %>%
ggplot(aes(x = t, y = prob, color = transition, group = transition)) + geom_line() + 
    facet_grid(. ~ eGFR) + ylim(c(0, 1)) + labs(y = "Probabilitiy of Transition in Time Period", 
    x = "Time Period (Years)", color = "Transition") + theme_classic()

hazard.msm(mod3)  #hazard ratios with CI
## [1] "No covariates on transition intensities"

Raw Value + Percent Change HMM (Estimated)

Outcome Assumptions

  • Outcome: eGFR

State Assumptions

  • Healthy State: eGFR ~ a0 + a1PregnantYes + a2SeasonRainy where a0 is initially assumed Norm(120, 12^2) but estimated
  • Unhealthy State: eGFR ~ b0 + a1PregnantYes + b2SeasonRainy where b0 is assumed Norm(60, 15^2) but estimated

Transition Assumptions

Transition intensity from state r to s is the e instantaneous risk of moving from state r to state s, modeled as

\[q_{rs}(z) = q_{rs}^{(0)}\exp(\beta^T_{rs}z)\]

and \(q_{rr} = -\sum_{s\not= r}q_{rs}\).

Constraints

  • We expect about 5% of this population to start in the Unhealthy State (eGFR < 90) and 95% starts in Healthy State (eGFR > 90)
  • We let transition intensities be impacted by percent change from baseline (Normal: + change or less then 10% decline from baseline, Big Decline: More then 10% decline from baseline)
  • Transition intensities and probabilities are independent of past state history (Markov assumption)
library("msm")

hmodel4 <- list(hmmNorm(mean = 120, sd = 12), hmmNorm(mean = 60, sd = 15))

mod4 <- msm(eGFR_cre ~ Years, subject = Id, data = gfr_sub2, qmatrix = rbind(c(0, 
    0.01), c(0.01, 0)), hmodel = hmodel4, hcovariates = list(~PregnantYes + SeasonRainy, 
    ~PregnantYes + SeasonRainy), hconstraint = list(PregnantYes = c(1, 1)), covariates = ~eGFRPerCat, 
    initprobs = c(0.95, 0.05), control = list(reltol = 1e-25, maxit = 10000), center = FALSE)

Fitted Model

Baseline Transition Intensities = \(q_{rs}^{(0)}\)

Hazard ratios = \(\exp(\beta_{rs})\)

Parameters

State 1: Healthy

  • mean: estimated intercept
  • sd: estimated variability of outcome in state
  • PregnantYes: estimated difference in mean eGFR between Pregnant and non-Pregnant observations (constrained to be the same for both states)
  • SeasonRainy: estimated difference in mean eGFR between Rainy and Dry Seasons

State 2: Unhealthy

  • mean: estimated intercept
  • sd: estimated variability of outcome in state
  • PregnantYes: estimated difference in mean eGFR between Pregnant and non-Pregnant observations (constrained to be the same for both states)
  • SeasonRainy: estimated difference in mean eGFR between Rainy and Dry Seasons
mod4
## 
## Call:
## msm(formula = eGFR_cre ~ Years, subject = Id, data = gfr_sub2,     qmatrix = rbind(c(0, 0.01), c(0.01, 0)), hmodel = hmodel4,     covariates = ~eGFRPerCat, hcovariates = list(~PregnantYes +         SeasonRainy, ~PregnantYes + SeasonRainy), hconstraint = list(PregnantYes = c(1,         1)), initprobs = c(0.95, 0.05), center = FALSE, control = list(reltol = 1e-25,         maxit = 10000))
## 
## Maximum likelihood estimates
## Baselines are with covariates set to 0
## 
## Transition intensities with hazard ratios for each covariate
##                   Baseline                       eGFRPerCatNormal        
## State 1 - State 1 -0.15471 (-0.297238,-0.080521)                         
## State 1 - State 2  0.15471 ( 0.080521, 0.297238) 3.956e-01 (0.179,0.8742)
## State 2 - State 1  0.02105 ( 0.004787, 0.092545) 5.689e-07 (0.000,   Inf)
## State 2 - State 2 -0.02105 (-0.092545,-0.004787)                         
## 
## Hidden Markov model, 2 states
## State 1 - normal distribution
## Parameters: 
##               Estimate        LCL         UCL
## mean        121.814771 120.899092 122.7304490
## sd            9.139974   8.688090   9.6153608
## PregnantYes  13.512675   7.985020  19.0403289
## SeasonRainy  -1.609828  -2.669647  -0.5500089
## 
## State 2 - normal distribution
## Parameters: 
##              Estimate        LCL       UCL
## mean        77.860355 73.3378509 82.382860
## sd          23.444099 22.0848117 24.887048
## PregnantYes 13.512675  7.9850202 19.040329
## SeasonRainy  4.711851 -0.1885915  9.612294
## 
## 
## -2 * log-likelihood:  17810.79 
## [Note, to obtain old print format, use "printold.msm"]

Estimate probability of going from state r to s in the period of t months for given covariate values

library(purrr)
library(tidyr)
library(tibble)

dat <- crossing(t = seq(0, 3, by = 0.5), A = 0:1)


tranprobs <- function(x, t, A) {
    as.vector(pmatrix.msm(x = x, t = t, covariates = list(eGFRPerCatNormal = A)))
}
dat %>%
    select(t, A) %>%
    pmap(.f = tranprobs, x = mod4) %>%
    enframe() %>%
    mutate(transition = list(c("State 1 to State 1", "State 2 to State 1", "State 1 to State 2", 
        "State 2 to State 2"))) %>%
    bind_cols(dat %>%
        select(t, A)) %>%
    unnest() %>%
    rename(prob = value) %>%
    mutate(eGFR = case_when(A == 1 ~ "Normal eGFR Change", TRUE ~ "High eGFR Decline")) %>%
    # mutate(Sex = case_when(B == 1 ~ 'Male', TRUE ~'Female')) %>%
ggplot(aes(x = t, y = prob, color = transition, group = transition)) + geom_line() + 
    facet_grid(. ~ eGFR) + ylim(c(0, 1)) + labs(y = "Probabilitiy of Transition in Time Period", 
    x = "Time Period (Years)", color = "Transition") + theme_classic()

hazard.msm(mod4)  #hazard ratios with CI
## $eGFRPerCatNormal
##                             HR         L         U
## State 1 - State 2 3.955948e-01 0.1790104 0.8742245
## State 2 - State 1 5.688522e-07 0.0000000       Inf

Raw Value + Percent Change HMM (Fixed Unhealthy State)

Outcome Assumptions

  • Outcome: eGFR

State Assumptions

  • Healthy State: eGFR ~ a0 + a1PregnantYes + a2SeasonRainy where a0 is initially assumed Norm(120, 12^2) but estimated
  • Unhealthy State: eGFR ~ b0 + a1PregnantYes + b2SeasonRainy where b0 is assumed Norm(70, 15^2) but fixed Transition Assumptions

Transition intensity from state r to s is the e instantaneous risk of moving from state r to state s, modeled as

\[q_{rs}(z) = q_{rs}^{(0)}\exp(\beta^T_{rs}z)\]

and \(q_{rr} = -\sum_{s\not= r}q_{rs}\).

Constraints

  • We expect about 5% of this population to start in the Unhealthy State (eGFR < 90) and 95% starts in Healthy State (eGFR > 90)
  • We let transition intensities be impacted by percent change from baseline (Normal: + change or less then 10% decline from baseline, Big Decline: More then 10% decline from baseline)
  • Transition intensities and probabilities are independent of past state history (Markov assumption)
library("msm")

hmodel5 <- list(hmmNorm(mean = 120, sd = 12), hmmNorm(mean = 70, sd = 15))

mod5 <- msm(eGFR_cre ~ Years, subject = Id, data = gfr_sub2, qmatrix = rbind(c(0, 
    0.01), c(0.01, 0)), hmodel = hmodel5, hcovariates = list(~PregnantYes + SeasonRainy, 
    ~PregnantYes + SeasonRainy), hconstraint = list(PregnantYes = c(1, 1)), covariates = ~eGFRPerCat, 
    initprobs = c(0.95, 0.05), control = list(reltol = 1e-25, maxit = 10000), fixedpars = c(7, 
        8), center = FALSE)

Fitted Model

Baseline Transition Intensities = \(q_{rs}^{(0)}\)

Hazard ratios = \(\exp(\beta_{rs})\)

Parameters

State 1: Healthy

  • mean: estimated intercept
  • sd: estimated variability of outcome in state
  • PregnantYes: estimated difference in mean eGFR between Pregnant and non-Pregnant observations (constrained to be the same for both states)
  • SeasonRainy: estimated difference in mean eGFR between Rainy and Dry Seasonsi

State 2: Unhealthy

  • mean: estimated intercept
  • sd: estimated variability of outcome in state
  • PregnantYes: estimated difference in mean eGFR between Pregnant and non-Pregnant observations (constrained to be the same for both states)
  • SeasonRainy: estimated difference in mean eGFR between Rainy and Dry Seasons
mod5
## 
## Call:
## msm(formula = eGFR_cre ~ Years, subject = Id, data = gfr_sub2,     qmatrix = rbind(c(0, 0.01), c(0.01, 0)), hmodel = hmodel5,     covariates = ~eGFRPerCat, hcovariates = list(~PregnantYes +         SeasonRainy, ~PregnantYes + SeasonRainy), hconstraint = list(PregnantYes = c(1,         1)), initprobs = c(0.95, 0.05), fixedpars = c(7, 8),     center = FALSE, control = list(reltol = 1e-25, maxit = 10000))
## 
## Maximum likelihood estimates
## Baselines are with covariates set to 0
## 
## Transition intensities with hazard ratios for each covariate
##                   Baseline                     eGFRPerCatNormal         
## State 1 - State 1 -0.13315 (-0.22648,-0.07828)                          
## State 1 - State 2  0.13315 ( 0.07828, 0.22648) 3.268e-01 (0.1673,0.6385)
## State 2 - State 1  0.05451 ( 0.02411, 0.12322) 4.627e-06 (0.0000,   Inf)
## State 2 - State 2 -0.05451 (-0.12322,-0.02411)                          
## 
## Hidden Markov model, 2 states
## State 1 - normal distribution
## Parameters: 
##               Estimate        LCL         UCL
## mean        120.983101 120.025993 121.9402087
## sd            9.923965   9.517420  10.3478762
## PregnantYes  14.368859   8.473357  20.2643603
## SeasonRainy  -1.856296  -2.952367  -0.7602244
## 
## State 2 - normal distribution
## Parameters: 
##              Estimate      LCL       UCL
## mean        70.000000       NA        NA
## sd          15.000000       NA        NA
## PregnantYes 14.368859 8.473357 20.264360
## SeasonRainy  6.809362 4.649229  8.969496
## 
## 
## -2 * log-likelihood:  18069.84 
## [Note, to obtain old print format, use "printold.msm"]

Estimate probability of going from state r to s in the period of t months for given covariate values

library(purrr)
library(tidyr)
library(tibble)

dat <- crossing(t = seq(0, 3, by = 0.5), A = 0:1)


tranprobs <- function(x, t, A) {
    as.vector(pmatrix.msm(x = x, t = t, covariates = list(eGFRPerCatNormal = A)))
}
dat %>%
    select(t, A) %>%
    pmap(.f = tranprobs, x = mod5) %>%
    enframe() %>%
    mutate(transition = list(c("State 1 to State 1", "State 2 to State 1", "State 1 to State 2", 
        "State 2 to State 2"))) %>%
    bind_cols(dat %>%
        select(t, A)) %>%
    unnest() %>%
    rename(prob = value) %>%
    mutate(eGFR = case_when(A == 1 ~ "Normal eGFR Change", TRUE ~ "High eGFR Decline")) %>%
    # mutate(Sex = case_when(B == 1 ~ 'Male', TRUE ~'Female')) %>%
ggplot(aes(x = t, y = prob, color = transition, group = transition)) + geom_line() + 
    facet_grid(. ~ eGFR) + ylim(c(0, 1)) + labs(y = "Probabilitiy of Transition in Time Period", 
    x = "Time Period (Years)", color = "Transition") + theme_classic()

hazard.msm(mod5)  #hazard ratios with CI
## $eGFRPerCatNormal
##                             HR        L         U
## State 1 - State 2 3.268474e-01 0.167305 0.6385298
## State 2 - State 1 4.627078e-06 0.000000       Inf

Model Comparison

Exploration Shiny App

Comparison of predicted States for each time across all individuals at https://bheggeseth.shinyapps.io/KidneyModelExploration/

tmp1 <- viterbi.msm(mod1)
tmp1$State <- apply(tmp1$pstate, 1, which.max)
tmp2 <- viterbi.msm(mod2)
tmp2$State <- apply(tmp2$pstate, 1, which.max)
tmp3 <- viterbi.msm(mod3)
tmp3$State <- apply(tmp3$pstate, 1, which.max)
tmp4 <- viterbi.msm(mod4)
tmp4$State <- apply(tmp4$pstate, 1, which.max)
tmp5 <- viterbi.msm(mod5)
tmp5$State <- apply(tmp5$pstate, 1, which.max)

# tmp1 <- viterbi.own(mod1,gfr_sub2) tmp2 <- viterbi.own(mod2,gfr_sub2) tmp3 <-
# viterbi.own(mod3,gfr_sub2) tmp4 <- viterbi.own(mod4,gfr_sub2) tmp5 <-
# viterbi.own(mod5,gfr_sub2)
## `summarise()` has grouped output by 'Id', 'Baseline'. You can override using the `.groups` argument.
## Joining, by = "Id"

Model 1 Visualizations

Visualizations of subject-specific trajectorires

  • Color reflects state with max posterior probability
  • Transparency reflects posterior probability/certainty in state membership (dark = 1, light = 0)
vizTraj <- function(SBJ, tmp, title) {
    tmp$Certainty <- apply(tmp$pstate, 1, max)
    tmp %>%
        mutate(State = factor(fitted)) %>%
        filter(subject %in% SBJ) %>%
        left_join(gfr_sub2 %>%
            select(Id, Years, eGFR_cre, Pregnant, Sex), by = c(subject = "Id", time = "Years")) %>%
        ggplot(aes(x = time, y = eGFR_cre, group = subject, color = State)) + geom_line() + 
        geom_hline(yintercept = c(60, 90), color = "lightgrey", linetype = 3) + geom_point(aes(shape = Pregnant, 
        alpha = Certainty)) + facet_wrap(Sex ~ subject) + scale_alpha(limits = c(0, 
        1)) + xlab("Years of Follow Up") + ylab("eGFR") + ylim(c(0, 150)) + scale_color_discrete("State", 
        limits = factor(2:1), labels = c("Unhealthy", "Healthy")) + ggtitle(title) + 
        theme_classic() + theme(legend.position = "top") + guides(alpha = FALSE)
}

i = 1

vizTraj(1:30, get(paste0("tmp", i)), paste0("Model ", i, ": Subjects 1-30")) %>%
    print()

vizTraj(31:60, get(paste0("tmp", i)), paste0("Model ", i, ": Subjects 31-60")) %>%
    print()

vizTraj(61:90, get(paste0("tmp", i)), paste0("Model ", i, ": Subjects 61-90")) %>%
    print()

vizTraj(91:120, get(paste0("tmp", i)), paste0("Model ", i, ": Subjects 91-120")) %>%
    print()
## Warning: Removed 1 row(s) containing missing values (geom_path).
## Warning: Removed 1 rows containing missing values (geom_point).

vizTraj(121:150, get(paste0("tmp", i)), paste0("Model ", i, ": Subjects 121-150")) %>%
    print()

vizTraj(151:180, get(paste0("tmp", i)), paste0("Model ", i, ": Subjects 151-180")) %>%
    print()
## Warning: Removed 1 rows containing missing values (geom_point).

vizTraj(181:210, get(paste0("tmp", i)), paste0("Model ", i, ": Subjects 181-210")) %>%
    print()

vizTraj(211:230, get(paste0("tmp", i)), paste0("Model ", i, ": Subjects 211-230")) %>%
    print()

Model 2 Visualizations

Visualizations of subject-specific trajectorires

  • Color reflects state with max posterior probability
  • Transparency reflects posterior probability/certainty in state membership (dark = 1, light = 0)
vizTraj <- function(SBJ, tmp, title) {
    tmp$Certainty <- apply(tmp$pstate, 1, max)
    tmp %>%
        mutate(State = factor(fitted)) %>%
        filter(subject %in% SBJ) %>%
        left_join(gfr_sub2 %>%
            select(Id, Years, eGFR_cre, Pregnant, Sex), by = c(subject = "Id", time = "Years")) %>%
        ggplot(aes(x = time, y = eGFR_cre, group = subject, color = State)) + geom_line() + 
        geom_hline(yintercept = c(60, 90), color = "lightgrey", linetype = 3) + geom_point(aes(shape = Pregnant, 
        alpha = Certainty)) + facet_wrap(Sex ~ subject) + scale_alpha(limits = c(0, 
        1)) + xlab("Years of Follow Up") + ylab("eGFR") + ylim(c(0, 150)) + scale_color_discrete("State", 
        limits = factor(2:1), labels = c("Unhealthy", "Healthy")) + ggtitle(title) + 
        theme_classic() + theme(legend.position = "top") + guides(alpha = FALSE)
}

i = 2
vizTraj(1:30, get(paste0("tmp", i)), paste0("Model ", i, ": Subjects 1-30")) %>%
    print()

vizTraj(31:60, get(paste0("tmp", i)), paste0("Model ", i, ": Subjects 31-60")) %>%
    print()

vizTraj(61:90, get(paste0("tmp", i)), paste0("Model ", i, ": Subjects 61-90")) %>%
    print()

vizTraj(91:120, get(paste0("tmp", i)), paste0("Model ", i, ": Subjects 91-120")) %>%
    print()
## Warning: Removed 1 row(s) containing missing values (geom_path).
## Warning: Removed 1 rows containing missing values (geom_point).

vizTraj(121:150, get(paste0("tmp", i)), paste0("Model ", i, ": Subjects 121-150")) %>%
    print()

vizTraj(151:180, get(paste0("tmp", i)), paste0("Model ", i, ": Subjects 151-180")) %>%
    print()
## Warning: Removed 1 rows containing missing values (geom_point).

vizTraj(181:210, get(paste0("tmp", i)), paste0("Model ", i, ": Subjects 181-210")) %>%
    print()

vizTraj(211:230, get(paste0("tmp", i)), paste0("Model ", i, ": Subjects 211-230")) %>%
    print()

Model 3 Visualizations

Visualizations of subject-specific trajectorires

  • Color reflects state with max posterior probability
  • Transparency reflects posterior probability/certainty in state membership (dark = 1, light = 0)
vizTraj <- function(SBJ, tmp, title) {
    tmp$Certainty <- apply(tmp$pstate, 1, max)
    tmp %>%
        mutate(State = factor(fitted)) %>%
        filter(subject %in% SBJ) %>%
        left_join(gfr_sub2 %>%
            select(Id, Years, eGFR_cre, Pregnant, Sex), by = c(subject = "Id", time = "Years")) %>%
        ggplot(aes(x = time, y = eGFR_cre, group = subject, color = State)) + geom_line() + 
        geom_hline(yintercept = c(60, 90), color = "lightgrey", linetype = 3) + geom_point(aes(shape = Pregnant, 
        alpha = Certainty)) + facet_wrap(Sex ~ subject) + scale_alpha(limits = c(0, 
        1)) + xlab("Years of Follow Up") + ylab("eGFR") + ylim(c(0, 150)) + scale_color_discrete("State", 
        limits = factor(2:1), labels = c("Unhealthy", "Healthy")) + ggtitle(title) + 
        theme_classic() + theme(legend.position = "top") + guides(alpha = FALSE)
}

i = 3
vizTraj(1:30, get(paste0("tmp", i)), paste0("Model ", i, ": Subjects 1-30")) %>%
    print()

vizTraj(31:60, get(paste0("tmp", i)), paste0("Model ", i, ": Subjects 31-60")) %>%
    print()

vizTraj(61:90, get(paste0("tmp", i)), paste0("Model ", i, ": Subjects 61-90")) %>%
    print()

vizTraj(91:120, get(paste0("tmp", i)), paste0("Model ", i, ": Subjects 91-120")) %>%
    print()
## Warning: Removed 1 row(s) containing missing values (geom_path).
## Warning: Removed 1 rows containing missing values (geom_point).

vizTraj(121:150, get(paste0("tmp", i)), paste0("Model ", i, ": Subjects 121-150")) %>%
    print()

vizTraj(151:180, get(paste0("tmp", i)), paste0("Model ", i, ": Subjects 151-180")) %>%
    print()
## Warning: Removed 1 rows containing missing values (geom_point).

vizTraj(181:210, get(paste0("tmp", i)), paste0("Model ", i, ": Subjects 181-210")) %>%
    print()

vizTraj(211:230, get(paste0("tmp", i)), paste0("Model ", i, ": Subjects 211-230")) %>%
    print()

Model 4 Visualizations

Visualizations of subject-specific trajectorires

  • Color reflects state with max posterior probability
  • Transparency reflects posterior probability/certainty in state membership (dark = 1, light = 0)
vizTraj <- function(SBJ, tmp, title) {
    tmp$Certainty <- apply(tmp$pstate, 1, max)
    tmp %>%
        mutate(State = factor(fitted)) %>%
        filter(subject %in% SBJ) %>%
        left_join(gfr_sub2 %>%
            select(Id, Years, eGFR_cre, Pregnant, Sex), by = c(subject = "Id", time = "Years")) %>%
        ggplot(aes(x = time, y = eGFR_cre, group = subject, color = State)) + geom_line() + 
        geom_hline(yintercept = c(60, 90), color = "lightgrey", linetype = 3) + geom_point(aes(shape = Pregnant, 
        alpha = Certainty)) + facet_wrap(Sex ~ subject) + scale_alpha(limits = c(0, 
        1)) + xlab("Years of Follow Up") + ylab("eGFR") + ylim(c(0, 150)) + scale_color_discrete("State", 
        limits = factor(2:1), labels = c("Unhealthy", "Healthy")) + ggtitle(title) + 
        theme_classic() + theme(legend.position = "top") + guides(alpha = FALSE)
}

i = 4
vizTraj(1:30, get(paste0("tmp", i)), paste0("Model ", i, ": Subjects 1-30")) %>%
    print()

vizTraj(31:60, get(paste0("tmp", i)), paste0("Model ", i, ": Subjects 31-60")) %>%
    print()

vizTraj(61:90, get(paste0("tmp", i)), paste0("Model ", i, ": Subjects 61-90")) %>%
    print()

vizTraj(91:120, get(paste0("tmp", i)), paste0("Model ", i, ": Subjects 91-120")) %>%
    print()
## Warning: Removed 1 row(s) containing missing values (geom_path).
## Warning: Removed 1 rows containing missing values (geom_point).

vizTraj(121:150, get(paste0("tmp", i)), paste0("Model ", i, ": Subjects 121-150")) %>%
    print()

vizTraj(151:180, get(paste0("tmp", i)), paste0("Model ", i, ": Subjects 151-180")) %>%
    print()
## Warning: Removed 1 rows containing missing values (geom_point).

vizTraj(181:210, get(paste0("tmp", i)), paste0("Model ", i, ": Subjects 181-210")) %>%
    print()

vizTraj(211:230, get(paste0("tmp", i)), paste0("Model ", i, ": Subjects 211-230")) %>%
    print()

Model 5 Visualizations

Visualizations of subject-specific trajectorires

  • Color reflects state with max posterior probability
  • Transparency reflects posterior probability/certainty in state membership (dark = 1, light = 0)
vizTraj <- function(SBJ, tmp, title) {
    tmp$Certainty <- apply(tmp$pstate, 1, max)
    tmp %>%
        mutate(State = factor(fitted)) %>%
        filter(subject %in% SBJ) %>%
        left_join(gfr_sub2 %>%
            select(Id, Years, eGFR_cre, Pregnant, Sex), by = c(subject = "Id", time = "Years")) %>%
        ggplot(aes(x = time, y = eGFR_cre, group = subject, color = State)) + geom_line() + 
        geom_hline(yintercept = c(60, 90), color = "lightgrey", linetype = 3) + geom_point(aes(shape = Pregnant, 
        alpha = Certainty)) + facet_wrap(Sex ~ subject) + scale_alpha(limits = c(0, 
        1)) + xlab("Years of Follow Up") + ylab("eGFR") + ylim(c(0, 150)) + scale_color_discrete("State", 
        limits = factor(2:1), labels = c("Unhealthy", "Healthy")) + ggtitle(title) + 
        theme_classic() + theme(legend.position = "top") + guides(alpha = FALSE)
}

i = 5
vizTraj(1:30, get(paste0("tmp", i)), paste0("Model ", i, ": Subjects 1-30")) %>%
    print()

vizTraj(31:60, get(paste0("tmp", i)), paste0("Model ", i, ": Subjects 31-60")) %>%
    print()

vizTraj(61:90, get(paste0("tmp", i)), paste0("Model ", i, ": Subjects 61-90")) %>%
    print()

vizTraj(91:120, get(paste0("tmp", i)), paste0("Model ", i, ": Subjects 91-120")) %>%
    print()
## Warning: Removed 1 row(s) containing missing values (geom_path).
## Warning: Removed 1 rows containing missing values (geom_point).

vizTraj(121:150, get(paste0("tmp", i)), paste0("Model ", i, ": Subjects 121-150")) %>%
    print()

vizTraj(151:180, get(paste0("tmp", i)), paste0("Model ", i, ": Subjects 151-180")) %>%
    print()
## Warning: Removed 1 rows containing missing values (geom_point).

vizTraj(181:210, get(paste0("tmp", i)), paste0("Model ", i, ": Subjects 181-210")) %>%
    print()

vizTraj(211:230, get(paste0("tmp", i)), paste0("Model ", i, ": Subjects 211-230")) %>%
    print()